# Start from an empty workspace. `all.names = TRUE` also removes objects whose
# names begin with a dot. The argument is spelled out in full because `all` is
# not a formal of ls() and only worked through partial argument matching.
rm(list = ls(all.names = TRUE))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Read sample metadata (without pool and blank samples):
# Load the tab-separated sample metadata table.
sample_info <- read_tsv(file = "Data/sample_metadata.tsv")
## Rows: 74 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): Sample_Identifier, SampleCollectionDateandTime, Metabolomics_Data,...
## dbl (3): AgeInYears, Sebumeter_Score, Skicon_Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Read the sequence and position recorded for each sample ID:
# Load the tab-separated table holding the sequence/position information.
sample_info_seq_pos <- read_tsv(file = "Data/sample_info_seq_pos.tsv")
## Rows: 110 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## dbl (2): Sequence, Position
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Read metabolomics datasets:
# Load the C18 negative-mode dataset (comma-separated).
data_C18_neg <- read_csv(file = "Data/Dataset_C18_neg.csv")
## Rows: 6137 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): best ion, auto MS2 verify, partners
## dbl (117): row ID, row m/z, row retention time, correlation group ID, annota...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the C18 positive-mode dataset (comma-separated).
data_C18_pos <- read_csv(file = "Data/Dataset_C18_pos.csv")
## Rows: 15089 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (113): row ID, row m/z, row retention time, EMR_04_10_MD, EMR_04_11_AT, ...
## lgl (7): correlation group ID, annotation network number, best ion, auto M...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the HILIC negative-mode dataset (comma-separated).
data_HILIC_neg <- read_csv(file = "Data/Dataset_HILIC_neg.csv")
## Rows: 11230 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (114): row ID, row m/z, row retention time, correlation group ID, EMR_04...
## lgl (6): annotation network number, best ion, auto MS2 verify, identified ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the HILIC positive-mode dataset (comma-separated).
data_HILIC_pos <- read_csv(file = "Data/Dataset_HILIC_pos.csv")
## Rows: 28762 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): best ion
## dbl (118): row ID, row m/z, row retention time, correlation group ID, annota...
## lgl (1): auto MS2 verify
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Combine the four datasets into one wide table and add a feature identifier:
# Reduce one feature table to the columns that are kept from every dataset.
#
# data:          feature table as read from one of the Dataset_*.csv files
# dataset_label: value stored in the `Dataset` column (e.g. "C18_neg")
# id_prefix:     dataset-specific prefix that is later prepended to the row ID
#                to build `Feature_ID`
#
# Returns `data` restricted to Dataset, Dataset_Prefix, the renamed row
# descriptors, and every column whose name starts with "EMR".
select_feature_columns <- function(data, dataset_label, id_prefix) {
  data %>%
    # `.env$` forces lookup of the function arguments even if `data` happens
    # to contain a column with the same name.
    mutate(
      Dataset = .env$dataset_label,
      Dataset_Prefix = .env$id_prefix
    ) %>%
    select(
      Dataset, Dataset_Prefix,
      Row_ID = `row ID`,
      Row_MZ = `row m/z`,
      Row_RT = `row retention time`,
      Correlation_Group_ID = `correlation group ID`,
      starts_with("EMR")
    )
}

# Stack the four datasets row-wise and give every row a Feature_ID.
data_wide <- bind_rows(
  select_feature_columns(data_C18_neg, "C18_neg", "X95"),
  select_feature_columns(data_C18_pos, "C18_pos", "X94"),
  select_feature_columns(data_HILIC_neg, "HILIC_neg", "X97"),
  select_feature_columns(data_HILIC_pos, "HILIC_pos", "X96")
) %>%
  mutate(
    # Feature_ID = dataset prefix + row ID zero-padded to at least 5 digits
    # (formatC widens the field for longer IDs rather than truncating them).
    Feature_ID = str_c(
      Dataset_Prefix,
      formatC(Row_ID, width = 5, format = "d", flag = "0")
    )
  ) %>%
  # The prefix is only needed to build Feature_ID; drop it and move the new
  # identifier to the first column.
  select(-Dataset_Prefix) %>%
  relocate(Feature_ID)

data_wide